In [1]:
# Wiktionary French frequency-list pages; each page name ends in the range of
# word ranks it covers (1-2000, 2001-4000, ..., 8001-10000).
url_list = [
    "https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists/{}-{}".format(
        first_rank, first_rank + 1999
    )
    for first_rank in range(1, 10000, 2000)
]
In [2]:
from urllib.request import urlopen
from lxml import html
import unicodedata
In [3]:
# Accumulators filled by the scraping loop: the set is used for membership
# checks when de-duplicating, the list keeps words in first-seen order.
french_words_set = set()
french_words = []
In [4]:
def strip_accents(s):
    """Return ``s`` with its combining marks (accents) removed.

    The string is first decomposed with NFD normalisation, which splits an
    accented letter into its base letter followed by combining mark(s); every
    character whose Unicode category is 'Mn' (nonspacing mark) is then dropped.
    Characters without a decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def process_french_word(word):
    """Normalise a word: lower-case it, then remove its accents."""
    lowered = word.lower()
    return strip_accents(lowered)
In [5]:
# Download each frequency-list page and collect its words in page order,
# keeping only the first occurrence of each normalised word.
for url in url_list:
    # Use the response as a context manager so the HTTP connection is closed
    # even if reading the body fails part-way through.
    with urlopen(url) as response:
        page_html = response.read()
    tree = html.fromstring(page_html)
    word_list = tree.xpath('.//div/table//tr//li/span/a')
    for w in word_list:
        word = w.text
        if word is None:
            # lxml reports .text as None when the link has no leading text
            # (for example when it only wraps child elements); there is no
            # word to record, and calling .lower() on None would raise.
            continue
        proc_word = process_french_word(word)
        if proc_word not in french_words_set:
            french_words_set.add(proc_word)
            french_words.append(proc_word)
In [6]:
# Write words to a text file, one word per line.
# The `with` block closes the file even if a write raises, and the explicit
# UTF-8 encoding keeps the output independent of the platform's default
# encoding (strip_accents leaves letters with no decomposition, such as 'œ',
# in the words, so the data is not guaranteed to be pure ASCII).
with open("french.txt", 'w', encoding="utf-8") as f_out:
    for word in french_words:
        f_out.write(word + "\n")
In [ ]: